library(leaflet)
library(ggplot2)
PM566 HW-1
Step 1
# Load the two PM2.5 CSV exports; the object names indicate the 2002 and 2022
# files. Paths point at the author's Downloads folder.
PM_2002 <- read.csv(file = "~/Downloads/ad_viz_plotval_data (1).csv")
PM_2022 <- read.csv(file = "~/Downloads/ad_viz_plotval_data.csv")
#Check dimension
dim(PM_2002)
[1] 15976 22
#Check headers and footers
head(PM_2002)
 Date Source Site.ID POC Daily.Mean.PM2.5.Concentration Units
1 01/05/2002 AQS 60010007 1 25.1 ug/m3 LC
2 01/06/2002 AQS 60010007 1 31.6 ug/m3 LC
3 01/08/2002 AQS 60010007 1 21.4 ug/m3 LC
4 01/11/2002 AQS 60010007 1 25.9 ug/m3 LC
5 01/14/2002 AQS 60010007 1 34.5 ug/m3 LC
6 01/17/2002 AQS 60010007 1 41.0 ug/m3 LC
Daily.AQI.Value Local.Site.Name Daily.Obs.Count Percent.Complete
1 81 Livermore 1 100
2 93 Livermore 1 100
3 74 Livermore 1 100
4 82 Livermore 1 100
5 98 Livermore 1 100
6 115 Livermore 1 100
AQS.Parameter.Code AQS.Parameter.Description Method.Code
1 88101 PM2.5 - Local Conditions 120
2 88101 PM2.5 - Local Conditions 120
3 88101 PM2.5 - Local Conditions 120
4 88101 PM2.5 - Local Conditions 120
5 88101 PM2.5 - Local Conditions 120
6 88101 PM2.5 - Local Conditions 120
Method.Description CBSA.Code
1 Andersen RAAS2.5-300 PM2.5 SEQ w/WINS 41860
2 Andersen RAAS2.5-300 PM2.5 SEQ w/WINS 41860
3 Andersen RAAS2.5-300 PM2.5 SEQ w/WINS 41860
4 Andersen RAAS2.5-300 PM2.5 SEQ w/WINS 41860
5 Andersen RAAS2.5-300 PM2.5 SEQ w/WINS 41860
6 Andersen RAAS2.5-300 PM2.5 SEQ w/WINS 41860
CBSA.Name State.FIPS.Code State County.FIPS.Code
1 San Francisco-Oakland-Hayward, CA 6 California 1
2 San Francisco-Oakland-Hayward, CA 6 California 1
3 San Francisco-Oakland-Hayward, CA 6 California 1
4 San Francisco-Oakland-Hayward, CA 6 California 1
5 San Francisco-Oakland-Hayward, CA 6 California 1
6 San Francisco-Oakland-Hayward, CA 6 California 1
County Site.Latitude Site.Longitude
1 Alameda 37.68753 -121.7842
2 Alameda 37.68753 -121.7842
3 Alameda 37.68753 -121.7842
4 Alameda 37.68753 -121.7842
5 Alameda 37.68753 -121.7842
6 Alameda 37.68753 -121.7842
tail(PM_2002)
 Date Source Site.ID POC Daily.Mean.PM2.5.Concentration Units
15971 12/10/2002 AQS 61131003 1 15 ug/m3 LC
15972 12/13/2002 AQS 61131003 1 15 ug/m3 LC
15973 12/22/2002 AQS 61131003 1 1 ug/m3 LC
15974 12/25/2002 AQS 61131003 1 23 ug/m3 LC
15975 12/28/2002 AQS 61131003 1 5 ug/m3 LC
15976 12/31/2002 AQS 61131003 1 6 ug/m3 LC
Daily.AQI.Value Local.Site.Name Daily.Obs.Count Percent.Complete
15971 62 Woodland-Gibson Road 1 100
15972 62 Woodland-Gibson Road 1 100
15973 6 Woodland-Gibson Road 1 100
15974 77 Woodland-Gibson Road 1 100
15975 28 Woodland-Gibson Road 1 100
15976 33 Woodland-Gibson Road 1 100
AQS.Parameter.Code AQS.Parameter.Description Method.Code
15971 88101 PM2.5 - Local Conditions 117
15972 88101 PM2.5 - Local Conditions 117
15973 88101 PM2.5 - Local Conditions 117
15974 88101 PM2.5 - Local Conditions 117
15975 88101 PM2.5 - Local Conditions 117
15976 88101 PM2.5 - Local Conditions 117
Method.Description CBSA.Code
15971 R & P Model 2000 PM2.5 Sampler w/WINS 40900
15972 R & P Model 2000 PM2.5 Sampler w/WINS 40900
15973 R & P Model 2000 PM2.5 Sampler w/WINS 40900
15974 R & P Model 2000 PM2.5 Sampler w/WINS 40900
15975 R & P Model 2000 PM2.5 Sampler w/WINS 40900
15976 R & P Model 2000 PM2.5 Sampler w/WINS 40900
CBSA.Name State.FIPS.Code State
15971 Sacramento--Roseville--Arden-Arcade, CA 6 California
15972 Sacramento--Roseville--Arden-Arcade, CA 6 California
15973 Sacramento--Roseville--Arden-Arcade, CA 6 California
15974 Sacramento--Roseville--Arden-Arcade, CA 6 California
15975 Sacramento--Roseville--Arden-Arcade, CA 6 California
15976 Sacramento--Roseville--Arden-Arcade, CA 6 California
County.FIPS.Code County Site.Latitude Site.Longitude
15971 113 Yolo 38.66121 -121.7327
15972 113 Yolo 38.66121 -121.7327
15973 113 Yolo 38.66121 -121.7327
15974 113 Yolo 38.66121 -121.7327
15975 113 Yolo 38.66121 -121.7327
15976 113 Yolo 38.66121 -121.7327
#Check variable names and types
names(PM_2002)
[1] "Date" "Source"
[3] "Site.ID" "POC"
[5] "Daily.Mean.PM2.5.Concentration" "Units"
[7] "Daily.AQI.Value" "Local.Site.Name"
[9] "Daily.Obs.Count" "Percent.Complete"
[11] "AQS.Parameter.Code" "AQS.Parameter.Description"
[13] "Method.Code" "Method.Description"
[15] "CBSA.Code" "CBSA.Name"
[17] "State.FIPS.Code" "State"
[19] "County.FIPS.Code" "County"
[21] "Site.Latitude" "Site.Longitude"
str(PM_2002)
'data.frame': 15976 obs. of 22 variables:
$ Date : chr "01/05/2002" "01/06/2002" "01/08/2002" "01/11/2002" ...
$ Source : chr "AQS" "AQS" "AQS" "AQS" ...
$ Site.ID : int 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
$ POC : int 1 1 1 1 1 1 1 1 1 1 ...
$ Daily.Mean.PM2.5.Concentration: num 25.1 31.6 21.4 25.9 34.5 41 29.3 15 18.8 37.9 ...
$ Units : chr "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
$ Daily.AQI.Value : int 81 93 74 82 98 115 89 62 69 107 ...
$ Local.Site.Name : chr "Livermore" "Livermore" "Livermore" "Livermore" ...
$ Daily.Obs.Count : int 1 1 1 1 1 1 1 1 1 1 ...
$ Percent.Complete : num 100 100 100 100 100 100 100 100 100 100 ...
$ AQS.Parameter.Code : int 88101 88101 88101 88101 88101 88101 88101 88101 88101 88101 ...
$ AQS.Parameter.Description : chr "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" ...
$ Method.Code : int 120 120 120 120 120 120 120 120 120 120 ...
$ Method.Description : chr "Andersen RAAS2.5-300 PM2.5 SEQ w/WINS" "Andersen RAAS2.5-300 PM2.5 SEQ w/WINS" "Andersen RAAS2.5-300 PM2.5 SEQ w/WINS" "Andersen RAAS2.5-300 PM2.5 SEQ w/WINS" ...
$ CBSA.Code : int 41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
$ CBSA.Name : chr "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
$ State.FIPS.Code : int 6 6 6 6 6 6 6 6 6 6 ...
$ State : chr "California" "California" "California" "California" ...
$ County.FIPS.Code : int 1 1 1 1 1 1 1 1 1 1 ...
$ County : chr "Alameda" "Alameda" "Alameda" "Alameda" ...
$ Site.Latitude : num 37.7 37.7 37.7 37.7 37.7 ...
$ Site.Longitude : num -122 -122 -122 -122 -122 ...
dim(PM_2022)
[1] 59918 22
head(PM_2022)
 Date Source Site.ID POC Daily.Mean.PM2.5.Concentration Units
1 01/01/2022 AQS 60010007 3 12.7 ug/m3 LC
2 01/02/2022 AQS 60010007 3 13.9 ug/m3 LC
3 01/03/2022 AQS 60010007 3 7.1 ug/m3 LC
4 01/04/2022 AQS 60010007 3 3.7 ug/m3 LC
5 01/05/2022 AQS 60010007 3 4.2 ug/m3 LC
6 01/06/2022 AQS 60010007 3 3.8 ug/m3 LC
Daily.AQI.Value Local.Site.Name Daily.Obs.Count Percent.Complete
1 58 Livermore 1 100
2 60 Livermore 1 100
3 39 Livermore 1 100
4 21 Livermore 1 100
5 23 Livermore 1 100
6 21 Livermore 1 100
AQS.Parameter.Code AQS.Parameter.Description Method.Code
1 88101 PM2.5 - Local Conditions 170
2 88101 PM2.5 - Local Conditions 170
3 88101 PM2.5 - Local Conditions 170
4 88101 PM2.5 - Local Conditions 170
5 88101 PM2.5 - Local Conditions 170
6 88101 PM2.5 - Local Conditions 170
Method.Description CBSA.Code
1 Met One BAM-1020 Mass Monitor w/VSCC 41860
2 Met One BAM-1020 Mass Monitor w/VSCC 41860
3 Met One BAM-1020 Mass Monitor w/VSCC 41860
4 Met One BAM-1020 Mass Monitor w/VSCC 41860
5 Met One BAM-1020 Mass Monitor w/VSCC 41860
6 Met One BAM-1020 Mass Monitor w/VSCC 41860
CBSA.Name State.FIPS.Code State County.FIPS.Code
1 San Francisco-Oakland-Hayward, CA 6 California 1
2 San Francisco-Oakland-Hayward, CA 6 California 1
3 San Francisco-Oakland-Hayward, CA 6 California 1
4 San Francisco-Oakland-Hayward, CA 6 California 1
5 San Francisco-Oakland-Hayward, CA 6 California 1
6 San Francisco-Oakland-Hayward, CA 6 California 1
County Site.Latitude Site.Longitude
1 Alameda 37.68753 -121.7842
2 Alameda 37.68753 -121.7842
3 Alameda 37.68753 -121.7842
4 Alameda 37.68753 -121.7842
5 Alameda 37.68753 -121.7842
6 Alameda 37.68753 -121.7842
tail(PM_2022)
 Date Source Site.ID POC Daily.Mean.PM2.5.Concentration Units
59913 12/01/2022 AQS 61131003 1 3.4 ug/m3 LC
59914 12/07/2022 AQS 61131003 1 3.8 ug/m3 LC
59915 12/13/2022 AQS 61131003 1 6.0 ug/m3 LC
59916 12/19/2022 AQS 61131003 1 34.8 ug/m3 LC
59917 12/25/2022 AQS 61131003 1 23.2 ug/m3 LC
59918 12/31/2022 AQS 61131003 1 1.0 ug/m3 LC
Daily.AQI.Value Local.Site.Name Daily.Obs.Count Percent.Complete
59913 19 Woodland-Gibson Road 1 100
59914 21 Woodland-Gibson Road 1 100
59915 33 Woodland-Gibson Road 1 100
59916 99 Woodland-Gibson Road 1 100
59917 77 Woodland-Gibson Road 1 100
59918 6 Woodland-Gibson Road 1 100
AQS.Parameter.Code AQS.Parameter.Description Method.Code
59913 88101 PM2.5 - Local Conditions 145
59914 88101 PM2.5 - Local Conditions 145
59915 88101 PM2.5 - Local Conditions 145
59916 88101 PM2.5 - Local Conditions 145
59917 88101 PM2.5 - Local Conditions 145
59918 88101 PM2.5 - Local Conditions 145
Method.Description CBSA.Code
59913 R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC 40900
59914 R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC 40900
59915 R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC 40900
59916 R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC 40900
59917 R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC 40900
59918 R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC 40900
CBSA.Name State.FIPS.Code State
59913 Sacramento--Roseville--Arden-Arcade, CA 6 California
59914 Sacramento--Roseville--Arden-Arcade, CA 6 California
59915 Sacramento--Roseville--Arden-Arcade, CA 6 California
59916 Sacramento--Roseville--Arden-Arcade, CA 6 California
59917 Sacramento--Roseville--Arden-Arcade, CA 6 California
59918 Sacramento--Roseville--Arden-Arcade, CA 6 California
County.FIPS.Code County Site.Latitude Site.Longitude
59913 113 Yolo 38.66121 -121.7327
59914 113 Yolo 38.66121 -121.7327
59915 113 Yolo 38.66121 -121.7327
59916 113 Yolo 38.66121 -121.7327
59917 113 Yolo 38.66121 -121.7327
59918 113 Yolo 38.66121 -121.7327
names(PM_2022)
[1] "Date" "Source"
[3] "Site.ID" "POC"
[5] "Daily.Mean.PM2.5.Concentration" "Units"
[7] "Daily.AQI.Value" "Local.Site.Name"
[9] "Daily.Obs.Count" "Percent.Complete"
[11] "AQS.Parameter.Code" "AQS.Parameter.Description"
[13] "Method.Code" "Method.Description"
[15] "CBSA.Code" "CBSA.Name"
[17] "State.FIPS.Code" "State"
[19] "County.FIPS.Code" "County"
[21] "Site.Latitude" "Site.Longitude"
str(PM_2022)
'data.frame': 59918 obs. of 22 variables:
$ Date : chr "01/01/2022" "01/02/2022" "01/03/2022" "01/04/2022" ...
$ Source : chr "AQS" "AQS" "AQS" "AQS" ...
$ Site.ID : int 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
$ POC : int 3 3 3 3 3 3 3 3 3 3 ...
$ Daily.Mean.PM2.5.Concentration: num 12.7 13.9 7.1 3.7 4.2 3.8 2.3 6.9 13.6 11.2 ...
$ Units : chr "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
$ Daily.AQI.Value : int 58 60 39 21 23 21 13 38 59 55 ...
$ Local.Site.Name : chr "Livermore" "Livermore" "Livermore" "Livermore" ...
$ Daily.Obs.Count : int 1 1 1 1 1 1 1 1 1 1 ...
$ Percent.Complete : num 100 100 100 100 100 100 100 100 100 100 ...
$ AQS.Parameter.Code : int 88101 88101 88101 88101 88101 88101 88101 88101 88101 88101 ...
$ AQS.Parameter.Description : chr "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" ...
$ Method.Code : int 170 170 170 170 170 170 170 170 170 170 ...
$ Method.Description : chr "Met One BAM-1020 Mass Monitor w/VSCC" "Met One BAM-1020 Mass Monitor w/VSCC" "Met One BAM-1020 Mass Monitor w/VSCC" "Met One BAM-1020 Mass Monitor w/VSCC" ...
$ CBSA.Code : int 41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
$ CBSA.Name : chr "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
$ State.FIPS.Code : int 6 6 6 6 6 6 6 6 6 6 ...
$ State : chr "California" "California" "California" "California" ...
$ County.FIPS.Code : int 1 1 1 1 1 1 1 1 1 1 ...
$ County : chr "Alameda" "Alameda" "Alameda" "Alameda" ...
$ Site.Latitude : num 37.7 37.7 37.7 37.7 37.7 ...
$ Site.Longitude : num -122 -122 -122 -122 -122 ...
#Check distribution for 2002
hist(PM_2002$Daily.Mean.PM2.5.Concentration)
plot(density(PM_2002$Daily.Mean.PM2.5.Concentration))
#Check distribution for 2022
hist(PM_2022$Daily.Mean.PM2.5.Concentration)
plot(density(PM_2022$Daily.Mean.PM2.5.Concentration))
Summary:
- Dimensions: 15976 rows and 22 columns for 2002. 59918 rows and 22 columns for 2022.
- Headers & Footers: Consistent between both datasets.
- Variable Names and Types: Same variable names across both datasets. Daily Mean PM2.5 Concentration was collected as a continuous variable.
For the distribution of Daily Mean PM2.5 Concentration during 2002, the histogram is skewed to the right, which means that most days in California had relatively low PM2.5 levels, with some occasional days of very high pollution. For 2022, the daily mean PM2.5 concentrations are more concentrated, suggesting that PM2.5 levels remained relatively consistent, given the single sharp peak in the histogram and density plot.
Step 2
# Replace each data set's daily date with its calendar year, then stack the
# two years into a single data frame (2002 rows first).
PM_2002[["Date"]] <- 2002
PM_2022[["Date"]] <- 2022
combined_PM <- rbind(PM_2002, PM_2022)
library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
# Give the analysis columns short names. State and County already carry the
# names used below, so they need no entry.
# NOTE(review): Local.Site.Name holds monitoring-site names (e.g.
# "Woodland-Gibson Road"); it is used as a stand-in for city.
combined_PM <- combined_PM %>%
  rename(
    Year = Date,
    PM25 = Daily.Mean.PM2.5.Concentration,
    City = Local.Site.Name,
    Lat = Site.Latitude,
    Lon = Site.Longitude
  )
Step 3
# Map one marker per monitoring site and year rather than one per daily
# reading: each site repeats on every sampling day, so plotting the raw rows
# stacks thousands of identical markers and slows the map down.
site_locations <- combined_PM %>%
  distinct(Year, Site.ID, Lat, Lon)

# Blue = 2002, red = 2022.
year_pal <- colorFactor(
  palette = c("blue", "red"),
  domain = site_locations$Year
)

leaflet(site_locations) %>%
  addTiles() %>%
  addCircleMarkers(
    lng = ~Lon,
    lat = ~Lat,
    color = ~year_pal(Year),
    radius = 1,
    fillOpacity = 0.6,
    popup = ~paste("Year:", Year)
  ) %>%
  addLegend(
    position = "bottomright",
    pal = year_pal,
    values = site_locations$Year,
    title = "Monitoring Sites"
  )
From 2002 to 2022, the distribution of the monitoring sites does change. There is an increase in monitoring sites, which is evident in the greater number of red dots on the map. Upon closer observation, in 2002, most of the monitoring sites were placed in bigger cities, with only a few placed between the cities. In 2022, this distribution changes, with significantly more monitoring sites located in smaller and lesser-known cities as well.
Step 4
#Check for missing or implausible values of PM2.5
sum(is.na(combined_PM$PM25))
[1] 0
summary(combined_PM$PM25)
 Min. 1st Qu. Median Mean 3rd Qu. Max.
-6.70 4.40 7.60 10.04 12.20 302.50
# Mark each reading as problematic (1) or fine (0). A reading is problematic
# when PM2.5 is missing, negative, or above 500.
combined_PM <- combined_PM %>%
  mutate(
    issue_flag = as.numeric(is.na(PM25) | PM25 < 0 | PM25 > 500)
  )
# Per year: number of readings, number flagged, and the flagged share
issue_summary <- summarise(
  group_by(combined_PM, Year),
  total_obs = n(),
  issues = sum(issue_flag),
  prop_issues = issues / total_obs
)
print(issue_summary)
# A tibble: 2 × 4
Year total_obs issues prop_issues
<dbl> <int> <dbl> <dbl>
1 2002 15976 0 0
2 2022 59918 215 0.00359
In 2002, there are no missing or implausible values of PM2.5. This suggests that there were little to no problems with the data quality for that year. In 2022, a small proportion of values (about 0.36%, 215 of 59,918) were flagged as problematic. Because no values are missing and the maximum (302.5) is below the 500 cutoff, all of the flagged values are negative concentrations. Although this is a very small fraction, it does suggest a shift from complete data quality to evidence of some issues in 2022. Possible explanations for this change could be the increase in monitoring sites, changes in protocols or instruments, or an increase in entry errors in recent years.
Step 5
Level 1: State-Level
# Visualization
# Year is stored as a number, so it is converted to a factor for the x axis;
# this gives one labelled box per year on a discrete axis instead of boxes
# placed on a continuous scale. The fill legend is titled "Year" rather than
# the default "factor(Year)".
ggplot(combined_PM, aes(x = factor(Year), y = PM25, fill = factor(Year))) +
  geom_boxplot() +
  labs(title = "California PM 2.5 Distribution in 2002 vs. 2022",
       x = "Year",
       y = "Daily Mean PM2.5",
       fill = "Year") +
  scale_fill_manual(values = c("blue", "red"))
# Summary
# Statewide descriptive statistics of PM2.5 for each year
state_summary <- summarise(
  group_by(combined_PM, Year),
  min_pm25 = min(PM25, na.rm = TRUE),
  max_pm25 = max(PM25, na.rm = TRUE),
  mean_pm25 = mean(PM25, na.rm = TRUE),
  median_pm25 = median(PM25, na.rm = TRUE),
  sd_pm25 = sd(PM25, na.rm = TRUE),
  n_obs = n()
)
print(state_summary)
# A tibble: 2 × 7
Year min_pm25 max_pm25 mean_pm25 median_pm25 sd_pm25 n_obs
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 2002 0 104. 16.1 12 13.9 15976
2 2022 -6.7 302. 8.41 6.8 7.64 59918
State-level Results: At the state level, the distribution of PM2.5 concentrations improves from 2002 to 2022. In 2002, the median is closer to the first quartile (Q1), indicating a high concentration of low PM2.5 values. The 2002 mean (16.1) is also well above its median (12), which means that unusually high values are pulling the mean up. The distribution becomes narrower in 2022 (standard deviation 7.64 versus 13.9), with a lower mean and median, suggesting reduced variability across monitoring sites statewide. Thus, the boxplots for 2002 and 2022 demonstrate an improvement in PM2.5 concentration levels over time.
Level 2: County-Level
# Visualization
# Aggregate to one mean per county and year before plotting. The raw data has
# one row per monitor per day, so plotting it directly would draw every daily
# reading and join all of a county's readings with a single line instead of
# comparing the two yearly means.
county_means <- combined_PM %>%
  group_by(County, Year) %>%
  summarise(mean_pm25 = mean(PM25, na.rm = TRUE), .groups = "drop")

# Counties are ordered by their average of the yearly means; the gray segment
# links a county's 2002 and 2022 values.
ggplot(county_means, aes(x = mean_pm25, y = reorder(County, mean_pm25))) +
  geom_line(aes(group = County), color = "gray70") +
  geom_point(aes(color = factor(Year)), size = 1) +
  labs(title = "PM2.5 Levels by County: 2002 vs. 2022",
       x = "Daily Mean PM2.5", y = "County", color = "Year")
# Summary
# Descriptive statistics of PM2.5 for every county-year combination
county_summary <- summarise(
  group_by(combined_PM, County, Year),
  n_obs = n(),
  mean_pm25 = mean(PM25, na.rm = TRUE),
  median_pm25 = median(PM25, na.rm = TRUE),
  min_pm25 = min(PM25, na.rm = TRUE),
  max_pm25 = max(PM25, na.rm = TRUE),
  .groups = "drop"
)
print(county_summary)
# A tibble: 98 × 7
County Year n_obs mean_pm25 median_pm25 min_pm25 max_pm25
<chr> <dbl> <int> <dbl> <dbl> <dbl> <dbl>
1 Alameda 2002 201 14.3 10 1.9 61.6
2 Alameda 2022 1793 8.20 7 -0.7 35.5
3 Butte 2002 473 14.8 11.5 1 88
4 Butte 2022 1121 6.19 4.5 -0.6 42.8
5 Calaveras 2002 60 9.9 8 2 40
6 Calaveras 2022 355 6.04 5 0 25.9
7 Colusa 2002 95 11.7 9 1 57
8 Colusa 2022 401 7.61 6.7 0.6 37
9 Contra Costa 2002 276 15.1 9.5 2 76.7
10 Contra Costa 2022 815 8.24 7.2 0.9 37.3
# ℹ 88 more rows
County-Level Results: The dot plot shows mean PM2.5 concentrations for each county in California in 2002 (red) and 2022 (blue). Overall, the majority of counties showed a decline in mean PM2.5 levels over the 20-year period, suggesting improvements in air quality across the state. A few counties display little change or higher PM2.5 levels in 2022, which could be due to local emissions, wildfires, or measurement differences. The plot highlights both geographical variation and temporal trends, illustrating that while statewide improvements are evident, the magnitude of change varies between counties.
Level 3: City Level
# Restrict the data to monitoring sites in Los Angeles County.
la_data <- combined_PM %>%
  filter(County == "Los Angeles")

# Visualization
# Kept from the original analysis: drop rows whose site name is NA.
# NOTE(review): blank site names show up as "" in the summaries, so this
# filter probably removes nothing - confirm.
combined_PM <- combined_PM %>%
  filter(!is.na(City))

# One mean per site and year, leaving out sites with no name. la_data itself
# is left untouched because the summaries below use it.
la_city_means <- la_data %>%
  filter(!is.na(City), City != "") %>%
  group_by(City, Year) %>%
  summarise(mean_pm25 = mean(PM25, na.rm = TRUE), .groups = "drop")

# Order sites by mean PM2.5. reorder() needs a numeric vector as its second
# argument; passing the whole data frame yields NA for every level and a
# warning per site.
ggplot(la_city_means, aes(x = mean_pm25, y = reorder(City, mean_pm25))) +
  geom_point(aes(color = factor(Year)), size = 1) +
  geom_line(aes(group = City), color = "gray70") +
  labs(x = "Daily Mean PM2.5", y = "City", color = "Year",
       title = "Mean PM2.5 by City in LA County: 2002 vs. 2022")
# Summary
# Descriptive statistics of PM2.5 for every site-year combination in la_data
city_summary <- summarise(
  group_by(la_data, City, Year),
  n_obs = n(),
  mean_pm25 = mean(PM25, na.rm = TRUE),
  median_pm25 = median(PM25, na.rm = TRUE),
  min_pm25 = min(PM25, na.rm = TRUE),
  max_pm25 = max(PM25, na.rm = TRUE),
  .groups = "drop"
)
print(city_summary)
# A tibble: 25 × 7
City Year n_obs mean_pm25 median_pm25 min_pm25 max_pm25
<chr> <dbl> <int> <dbl> <dbl> <dbl> <dbl>
1 "" 2002 118 23.9 21.4 5.6 61
2 "Azusa" 2002 339 20.8 18.7 3.1 72.4
3 "Azusa" 2022 76 9.72 9.65 3.1 18.4
4 "Burbank" 2002 122 24.0 21.6 3.5 63
5 "Compton" 2022 723 13.0 11.9 2.6 54.6
6 "Glendora" 2022 365 8.42 7.8 -0.8 56
7 "Lancaster-Division Stre… 2002 107 10.4 10 1 24
8 "Lancaster-Division Stre… 2022 348 7.52 7.3 1.9 15.1
9 "Lebec" 2002 109 4.82 4.8 0.6 12.4
10 "Lebec" 2022 41 3.50 3.4 0.9 7.3
# ℹ 15 more rows
# Five site-year combinations with the highest mean PM2.5 in la_data.
# summarise() must return an ungrouped result (.groups = "drop"): if the
# output stays grouped by City, slice_head(n = 5) keeps up to five rows per
# city and therefore returns every row instead of the overall top five.
top5_cities <- la_data %>%
  group_by(City, Year) %>%
  summarise(mean_pm25 = mean(PM25, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(mean_pm25)) %>%
  slice_head(n = 5)
print(top5_cities)# A tibble: 25 × 3
# Groups: City [18]
City Year mean_pm25
<chr> <dbl> <dbl>
1 "" 2002 23.9
2 "Azusa" 2002 20.8
3 "Azusa" 2022 9.72
4 "Burbank" 2002 24.0
5 "Compton" 2022 13.0
6 "Glendora" 2022 8.42
7 "Lancaster-Division Street" 2002 10.4
8 "Lancaster-Division Street" 2022 7.52
9 "Lebec" 2002 4.82
10 "Lebec" 2022 3.50
# ℹ 15 more rows
City-Level Results: At the city level, the dot plot shows a general decrease in mean PM2.5 concentration levels across almost all monitoring sites in Los Angeles County from 2002 to 2022. In densely populated cities especially, such as Los Angeles, mean PM2.5 declined by almost half (~21 to ~11 by 2022), suggesting that pollution controls and air quality management efforts have been impactful in urban areas. Smaller cities also show improvements by 2022, although the decrease is smaller in magnitude. Overall, the dot plot highlights both a general downward trend and some variation in the degree of improvement between cities, given differences in local emission sources, geography, and population density.